# this next line shouldn't have to be here
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from pandas.plotting import register_matplotlib_converters
from ydata_profiling import ProfileReport
# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()
# Apply seaborn's default theme first, then the "notebook" context on top of it.
sns.set()
sns.set_context("notebook")
# Default figure size as (width, height); set after the seaborn calls above.
plt.rcParams["figure.figsize"] = 10, 6
# Display every column and row (None = no limit) and 4 decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.precision = 4
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Tick formatter: whole dollars with thousands separators, e.g. "$1,234".
dollar_formatter = FuncFormatter(lambda x, pos: f"${x:,.0f}")
thousands_formatter = FuncFormatter(lambda x, pos: f"{x:,.0f}")
Hockey Goals
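Exploratory analysis of the TidyTuesday 2020-03-03 hockey goals data set: load the game-level, season-level, and top-250 scorer tables, profile each one, and plot cumulative goals by age for the top scorers.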
Imports
Constants
DAYS_IN_YEAR = 365.25
Data
Raw
game_goals_raw = pd.read_csv(
"https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/game_goals.csv"
)
top_250_raw = pd.read_csv(
"https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/top_250.csv"
)
season_goals_raw = pd.read_csv(
"https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/season_goals.csv"
)
game_goals_raw.head()
| | player | season | rank | date | game_num | age | team | at | opp | location | outcome | goals | assists | points | plus_minus | penalty_min | goals_even | goals_powerplay | goals_short | goals_gamewinner | assists_even | assists_powerplay | assists_short | shots | shot_percent |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Alex Ovechkin | 2006 | 1 | 2005-10-05 | 1 | 20-018 | WSH | NaN | CBJ | Home | W | 2 | 0 | 2 | 1 | 2 | 1 | 1 | 0 | 0 | NaN | NaN | NaN | 5 | 40.0 |
| 1 | Alex Ovechkin | 2006 | 2 | 2005-10-07 | 2 | 20-020 | WSH | NaN | ATL | Home | L | 0 | 1 | 1 | -2 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 1 | 0.0 |
| 2 | Alex Ovechkin | 2006 | 3 | 2005-10-08 | 3 | 20-021 | WSH | @ | ATL | Away | L | 0 | 1 | 1 | 0 | 4 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 3 | 0.0 |
| 3 | Alex Ovechkin | 2006 | 4 | 2005-10-10 | 4 | 20-023 | WSH | NaN | NYR | Home | W | 1 | 0 | 1 | 1 | 2 | 0 | 1 | 0 | 1 | NaN | NaN | NaN | 6 | 16.7 |
| 4 | Alex Ovechkin | 2006 | 5 | 2005-10-12 | 5 | 20-025 | WSH | @ | CAR | Away | L | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | NaN | NaN | NaN | 6 | 16.7 |
top_250_raw.head()
| | raw_rank | player | years | total_goals | url_number | raw_link | link | active | yr_start |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | Wayne Gretzky | 1979-99 | 894 | 1 | /players/g/gretzwa01.html | https://www.hockey-reference.com/players/g/gre... | Retired | 1979 |
| 1 | 2.0 | Gordie Howe | 1946-80 | 801 | 2 | /players/h/howego01.html | https://www.hockey-reference.com/players/h/how... | Retired | 1946 |
| 2 | 3.0 | Jaromir Jagr | 1990-18 | 766 | 3 | /players/j/jagrja01.html | https://www.hockey-reference.com/players/j/jag... | Retired | 1990 |
| 3 | 4.0 | Brett Hull | 1986-06 | 741 | 4 | /players/h/hullbr01.html | https://www.hockey-reference.com/players/h/hul... | Retired | 1986 |
| 4 | 5.0 | Marcel Dionne | 1971-89 | 731 | 5 | /players/d/dionnma01.html | https://www.hockey-reference.com/players/d/dio... | Retired | 1971 |
season_goals_raw.head()
| | rank | position | hand | player | years | total_goals | status | yr_start | season | age | team | league | season_games | goals | assists | points | plus_minus | penalty_min | goals_even | goals_power_play | goals_short_handed | goals_game_winner | headshot |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | C | Left | Wayne Gretzky | 1979-99 | 894 | Retired | 1979 | 1978-79 | 18 | TOT | WHA | 80 | 46 | 64 | 110 | 20.0 | 19 | NaN | NaN | NaN | NaN | https://d9kjk42l7bfqz.cloudfront.net/req/20191... |
| 1 | 1 | C | Left | Wayne Gretzky | 1979-99 | 894 | Retired | 1979 | 1978-79 | 18 | INR | WHA | 8 | 3 | 3 | 6 | -3.0 | 0 | 3.0 | 0.0 | 0.0 | NaN | https://d9kjk42l7bfqz.cloudfront.net/req/20191... |
| 2 | 1 | C | Left | Wayne Gretzky | 1979-99 | 894 | Retired | 1979 | 1978-79 | 18 | EDO | WHA | 72 | 43 | 61 | 104 | 23.0 | 19 | 34.0 | 9.0 | 0.0 | NaN | https://d9kjk42l7bfqz.cloudfront.net/req/20191... |
| 3 | 1 | C | Left | Wayne Gretzky | 1979-99 | 894 | Retired | 1979 | 1979-80 | 19 | EDM | NHL | 79 | 51 | 86 | 137 | 14.0 | 21 | 37.0 | 13.0 | 1.0 | 6.0 | https://d9kjk42l7bfqz.cloudfront.net/req/20191... |
| 4 | 1 | C | Left | Wayne Gretzky | 1979-99 | 894 | Retired | 1979 | 1980-81 | 20 | EDM | NHL | 80 | 55 | 109 | 164 | 41.0 | 28 | 36.0 | 15.0 | 4.0 | 3.0 | https://d9kjk42l7bfqz.cloudfront.net/req/20191... |
Functions
def raw_game_to_clean(raw: pd.DataFrame) -> pd.DataFrame:
    """Return the game-level table with a fractional ``years_old`` column added.

    The ``age`` column holds strings of the form ``"<years>-<days>"`` (for
    example ``"20-018"``). Each one is converted to a float number of years
    as ``years + days / DAYS_IN_YEAR``.

    Args:
        raw: Game-level frame with a string ``age`` column.

    Returns:
        A new frame with ``years_old`` added; ``raw`` itself is not modified.
    """
    # Split once and reuse both pieces instead of splitting the column twice.
    age_parts = raw["age"].str.split("-")
    whole_years = age_parts.str[0].astype(float)
    extra_days = age_parts.str[1].astype(float)
    return raw.assign(years_old=whole_years + extra_days / DAYS_IN_YEAR)
def raw_top_to_clean(raw: pd.DataFrame) -> pd.DataFrame:
    """Return the top-250 scorers table unchanged (cleaning placeholder).

    Args:
        raw: Top-250 frame as read from the CSV.

    Returns:
        The same object that was passed in; no copy is made.
    """
    # TODO: probably forward-fill ``raw_rank`` (original author's note; unconfirmed).
    return raw
def raw_season_to_clean(raw: pd.DataFrame) -> pd.DataFrame:
    """Return the season-level table unchanged (cleaning placeholder).

    Args:
        raw: Season-level frame as read from the CSV.

    Returns:
        The same object that was passed in; no copy is made.
    """
    return raw
def list_of_top_n_playernames(top_n=8):
    """Return the ``player`` names from the first ``top_n`` rows of ``top_250``.

    Reads the module-level ``top_250`` frame at call time, so that frame must
    be defined before this function is called.

    Args:
        top_n: Number of leading rows to take (default 8).

    Returns:
        A list of player names in row order.
    """
    return top_250["player"].iloc[:top_n].to_list()


game_goals = raw_game_to_clean(game_goals_raw)
top_250 = raw_top_to_clean(top_250_raw)
season_goals = raw_season_to_clean(season_goals_raw)
EDA
game_profile = ProfileReport(game_goals, config_file="config_minimal.yaml")
top_profile = ProfileReport(top_250, config_file="config_minimal.yaml")
season_profile = ProfileReport(season_goals, config_file="config_minimal.yaml")
game_profile
<class 'ydata_profiling.profile_report.ProfileReport'>.__repr__ returned empty string
top_profile
<class 'ydata_profiling.profile_report.ProfileReport'>.__repr__ returned empty string
season_profile
<class 'ydata_profiling.profile_report.ProfileReport'>.__repr__ returned empty string
Plots
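Warning – the dataset may be inconsistent: for most of the top scorers, the per-season goals in season_goals sum to more than total_goals in the top-250 table. A likely cause, visible in the season_goals_raw.head() sample above: "TOT" rows repeat goals that are also listed on the per-team rows (46 = 3 + 43 for 1978-79), and non-NHL (WHA) seasons are included.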
# For each of the first 8 players in top_250, print the career total from
# top_250 minus the sum of that player's goals in season_goals.
# A value of 0 means the two tables agree for that player.
for player_name in list_of_top_n_playernames(top_n=8):
    sum_goals = season_goals.loc[season_goals["player"] == player_name, "goals"].sum()
    top_goals = top_250.loc[top_250["player"] == player_name, "total_goals"].iloc[0]
    print(f"{top_goals - sum_goals}, {player_name}")
# Output:
# -115, Wayne Gretzky
# -174, Gordie Howe
# -64, Jaromir Jagr
# -32, Brett Hull
# -28, Marcel Dionne
# -35, Phil Esposito
# -139, Mike Gartner
# 0, Alex Ovechkin
What do the cumulative goals scored look like as a function of age for the top scorers?
# Cumulative goals versus age, one line per player for the first 8 players
# in top_250. Goals are accumulated in the row order of game_goals.
fig, ax = plt.subplots(figsize=(10, 6))
fig.patch.set_facecolor("w")  # white figure background
for player_name in list_of_top_n_playernames(top_n=8):
    plotdf = game_goals[game_goals["player"] == player_name]
    ax.plot(plotdf["years_old"], plotdf["goals"].cumsum(), label=player_name)

# Narrow the axes to 80% of their width and anchor the legend just outside
# the right edge of the axes.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))

ax.set_title("Total Goals Scored by Age")
ax.set_xlabel("Age [Years]")
ax.set_ylabel("Goals Scored")
fig.tight_layout()